# !diagnostics off
#setwd('/afs/inf.ed.ac.uk/user/s17/s1725186/Documents/PhD-Models/FirstPUModel/RMarkdowns')
library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(plotlyutils)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally) ; library(ggpubr)
library(Rtsne)
library(ClusterR)
library(DESeq2)
library(expss)
library(knitr)
Load preprocessed dataset (preprocessing code in 19_10_14_data_preprocessing.Rmd)
# Gandal dataset
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
genes_info = DE_info %>% data.frame
# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID'=as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neuronal'=1)
# SFARI Genes
SFARI_genes = read_csv('./../../../SFARI/Data/SFARI_genes_01-03-2020_w_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]
# Update genes_info with SFARI and Neuronal information
genes_info = genes_info %>% mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>%
mutate(`gene-score`=ifelse(is.na(`gene-score`), 'Others', `gene-score`)) %>%
distinct(ID, .keep_all = TRUE) %>% left_join(GO_neuronal, by='ID') %>%
mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
mutate(gene.score=ifelse(`gene-score`=='Others' & Neuronal==1, 'Neuronal', `gene-score`),
significant=padj<0.05 & !is.na(padj)) %>%
mutate(Group = factor(ifelse(gene.score %in% c('Neuronal','Others'), gene.score, 'SFARI'),
levels = c('SFARI', 'Neuronal', 'Others')))
SFARI_colour_hue = function(r) {
pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}
rm(GO_annotations, DE_info)
cat(paste0('There are ', length(unique(SFARI_genes$`gene-symbol`)), ' genes with a SFARI score'))
## There are 912 genes with a SFARI score
There are 912 genes with a SFARI score. but to map them to gene expression mapa we had to map the gene names to their corresponding ensembl IDs
cat(paste0('There are ', nrow(SFARI_genes), ' Ensembl IDs corresponding to the ',
length(unique(SFARI_genes$`gene-symbol`)),' genes in the SFARI Gene dataset'))
## There are 1116 Ensembl IDs corresponding to the 912 genes in the SFARI Gene dataset
Since a gene can have more than one ensembl ID, there were some one-to-many mappings between a gene name and ensembl IDs, so that’s why we ended up with 1090 rows in the SFARI_genes dataset.
The details about how the genes were annotated with their Ensembl IDs can be found in 20_02_06_get_ensembl_ids.html
cat(paste0('There are ', sum(is.na(SFARI_genes$`gene-score`)) ,
' genes in the SFARI list without a score, of which ',
sum(is.na(SFARI_genes$`gene-score`) & SFARI_genes$syndromic==0),
' don\'t have syndromic tag either'))
## There are 87 genes in the SFARI list without a score, of which 0 don't have syndromic tag either
cat(paste0('There are ',sum(SFARI_genes$ID %in% rownames(datExpr)),' SFARI Genes in the expression dataset (~',
round(100*mean(SFARI_genes$ID %in% rownames(datExpr))),'%)'))
## There are 864 SFARI Genes in the expression dataset (~77%)
cat(paste0('Of these, only ', sum(genes_info$`gene-score`!='Others'), ' have an assigned score'))
## Of these, only 789 have an assigned score
From now on, we’re only going to focus on the 789 genes with a score
Gene count by SFARI score:
table_info = genes_info %>% apply_labels(`gene-score` = 'SFARI Gene Score', syndromic = 'Syndromic Tag',
Neuronal = 'Neuronal Function', gene.score = 'Gene Score') %>%
mutate(syndromic = as.logical(syndromic), Neuronal = as.logical(Neuronal))
cro(table_info$`gene-score`)
| #Total | |
|---|---|
| SFARI Gene Score | |
| 1 | 144 |
| 2 | 205 |
| 3 | 440 |
| Others | 15358 |
| #Total cases | 16147 |
Gene count by Syndromic tag:
cro(table_info$syndromic)
| #Total | |
|---|---|
| Syndromic Tag | |
| FALSE | 748 |
| TRUE | 116 |
| #Total cases | 864 |
GO Neuronal annotations:
cat(glue(sum(GO_neuronal$ID %in% rownames(datExpr)), ' genes have neuronal-related annotations'))
## 1094 genes have neuronal-related annotations
cat(glue(sum(SFARI_genes$ID %in% GO_neuronal$ID),' of these genes have a SFARI score'))
## 179 of these genes have a SFARI score
cro(table_info$gene.score[genes_info$`gene-score` %in% c('1','2','3','4','5','6')],
list(table_info$Neuronal[genes_info$`gene-score` %in% c('1','2','3','4','5','6')], total()))
| Neuronal Function | #Total | |||
|---|---|---|---|---|
| FALSE | TRUE | |||
| Gene Score | ||||
| 1 | 105 | 39 | 144 | |
| 2 | 161 | 44 | 205 | |
| 3 | 365 | 75 | 440 | |
| #Total cases | 631 | 158 | 789 | |
rm(table_info)
Larger mean expression than the other two groups, smaller SD than Neuronal genes
plot_data = data.frame('ID'=rownames(datExpr), 'MeanExpr'=rowMeans(datExpr), 'SDExpr'=apply(datExpr,1,sd)) %>%
left_join(genes_info, by='ID') %>%
mutate(Group = factor(ifelse(gene.score %in% c('Neuronal','Others'), gene.score, 'SFARI'),
levels = c('SFARI', 'Neuronal', 'Others')))
p1 = ggplotly(plot_data %>% ggplot(aes(Group, MeanExpr, fill=Group)) + geom_boxplot() +
scale_fill_manual(values=c('#00A4F7', SFARI_colour_hue(r=c(8,7)))) + theme_minimal() +
theme(legend.position='none'))
p2 = ggplotly(plot_data %>% ggplot(aes(Group, SDExpr, fill=Group)) + geom_boxplot() +
scale_fill_manual(values=c('#00A4F7', SFARI_colour_hue(r=c(8,7)))) + theme_minimal() +
ggtitle('Mean Expression (left) and SD (right) comparison between SFARI Genes and the rest of the dataset') +
theme(legend.position='none'))
plotly::subplot(p1, p2, nrows=1)
p1 = plot_data %>% ggplot(aes(Group, MeanExpr, color = Group)) +
stat_summary(fun = mean, geom = 'point', size = 3) +
stat_summary(fun.data = mean_sd, geom = 'errorbar', width = 0.5) +
stat_compare_means(comparisons = list(c('SFARI','Neuronal'), c('SFARI','Others')),
label = 'p.signif', method = 't.test',
method.args = list(var.equal = FALSE), label.y = c(12,13)) +
scale_colour_manual(values=c('#00A4F7', SFARI_colour_hue(r=c(8,7)))) + xlab('') +
ylab('Mean Level of Expression') + ggtitle('Mean Expression') + theme_minimal() +
theme(legend.position = 'none')
p2 = plot_data %>% ggplot(aes(Group, SDExpr, color = Group)) +
stat_summary(fun = mean, geom = 'point', size = 3) +
stat_summary(fun.data = mean_sd, geom = 'errorbar', width = 0.5) +
stat_compare_means(comparisons = list(c('SFARI','Neuronal'), c('SFARI','Others')),
label = 'p.signif', method = 't.test',
method.args = list(var.equal = FALSE), label.y = c(0.35, 0.4)) +
scale_colour_manual(values=c('#00A4F7', SFARI_colour_hue(r=c(8,7)))) + xlab('') +
ylab('SD of Level of Expression') + ggtitle('Standard Deviation') + theme_minimal() +
theme(legend.position = 'none')
grid.arrange(p1, p2, nrow=1, top = 'Comparison Between SFARI Genes and the rest of the dataset')
rm(plot_data, p1, p2)
Proportion of over- and under-expressed genes is very similar between groups: approximately half
genes_info %>% mutate(direction = ifelse(log2FoldChange>0, 'over-expressed', 'under-expressed')) %>%
group_by(Group, direction) %>% tally(name = 'over_expr') %>%
filter(direction == 'over-expressed') %>% ungroup %>%
left_join(genes_info %>% group_by(Group) %>% tally(name = 'Total'), by = 'Group') %>% ungroup %>%
mutate('prop_over_expr' = round(over_expr/Total,3)) %>% dplyr::select(-direction) %>% kable
| Group | over_expr | Total | prop_over_expr |
|---|---|---|---|
| SFARI | 380 | 789 | 0.482 |
| Neuronal | 461 | 936 | 0.493 |
| Others | 7419 | 14422 | 0.514 |
Smaller lFC magnitude than Neuronal genes and similar but slightly lower than the rest of the genes
ggplotly(genes_info %>% ggplot(aes(x=Group, y=abs(log2FoldChange), fill=Group)) + geom_boxplot() +
scale_fill_manual(values=c('#00A4F7', SFARI_colour_hue(r=c(8,7)))) + ylab('logFoldChange Magnitude') +
xlab('Group') + theme_minimal() + theme(legend.position='none'))
truncated_sd = function(x) data.frame('y' = mean(x), 'ymin' = max(mean(x)-sd(x),0), 'ymax' = mean(x)+sd(x))
genes_info %>% ggplot(aes(Group, abs(log2FoldChange), color = Group)) +
stat_summary(fun = mean, geom = 'point', size = 3) +
stat_summary(fun.data = function(x) truncated_sd(x), geom = 'errorbar', width = 0.5) +
stat_compare_means(comparisons = list(c('SFARI','Neuronal'), c('SFARI','Others')),
label = 'p.signif', method = 't.test', method.args = list(var.equal = FALSE),
label.y = c(0.4,0.5)) +
scale_colour_manual(values=c('#00A4F7', SFARI_colour_hue(r=c(8,7)))) + xlab('') +
ylab('LFC Magnitude') + theme_minimal() + theme(legend.position = 'none')
rm(truncated_sd)
plot_data = genes_info %>% dplyr::select(Group, log2FoldChange) %>%
mutate(quant = cut(log2FoldChange, breaks = quantile(log2FoldChange, probs = seq(0,1,0.05)) %>%
as.vector, labels = FALSE),
value_range = cut(log2FoldChange, breaks = quantile(log2FoldChange, probs=seq(0,1,0.05)) %>%
as.vector)) %>%
filter(Group == 'SFARI') %>% group_by(quant, value_range) %>% tally %>% ungroup %>%
left_join(genes_info %>% dplyr::select(Group, log2FoldChange) %>%
mutate(quant = cut(log2FoldChange, breaks = quantile(log2FoldChange,
probs = seq(0,1,0.05)) %>% as.vector, labels = FALSE)) %>%
group_by(quant) %>% tally(name = 'tot') %>% ungroup) %>% mutate(p = 100*n/tot)
ggplotly(plot_data %>% ggplot(aes(quant, p)) + geom_smooth(color = 'gray', alpha = 0.1) +
geom_bar(stat = 'identity', fill = '#00A4F7', aes(id = value_range)) +
geom_hline(yintercept = 100*mean(genes_info$Group == 'SFARI'), color = 'gray', linetype = 'dotted') +
xlab('Log Fold Change Quantiles') + ylab('% of SFARI Genes in each Quantile') + ggtitle('
Distribution of SFARI Genes in LFC Quantiles') + theme_minimal())
cat('LFC values for each quantile:')
## LFC values for each quantile:
quants = data.frame('Quantile' = 1:20, 'LFC Range' = cut(genes_info$log2FoldChange,
breaks = quantile(genes_info$log2FoldChange, probs = seq(0,1,0.05)) %>% as.vector) %>% table %>% names)
quants %>% kable
| Quantile | LFC.Range |
|---|---|
| 1 | (-2.04,-0.229] |
| 2 | (-0.229,-0.164] |
| 3 | (-0.164,-0.125] |
| 4 | (-0.125,-0.0988] |
| 5 | (-0.0988,-0.0777] |
| 6 | (-0.0777,-0.0598] |
| 7 | (-0.0598,-0.0427] |
| 8 | (-0.0427,-0.0261] |
| 9 | (-0.0261,-0.0114] |
| 10 | (-0.0114,0.00376] |
| 11 | (0.00376,0.0188] |
| 12 | (0.0188,0.0352] |
| 13 | (0.0352,0.052] |
| 14 | (0.052,0.0731] |
| 15 | (0.0731,0.097] |
| 16 | (0.097,0.129] |
| 17 | (0.129,0.171] |
| 18 | (0.171,0.238] |
| 19 | (0.238,0.373] |
| 20 | (0.373,1.96] |
rm(quants)
Lower proportion of DE genes than genes with Neuronal annotation but higher than the rest of the genes
genes_info %>% group_by(Group, significant) %>% tally(name = 'DE') %>% filter(significant) %>% ungroup %>%
left_join(genes_info %>% group_by(Group) %>% tally(name = 'Total'), by = 'Group') %>% ungroup %>%
mutate('prop_DE' = round(DE/Total,2)) %>% dplyr::select(-significant) %>% kable
| Group | DE | Total | prop_DE |
|---|---|---|---|
| SFARI | 244 | 789 | 0.31 |
| Neuronal | 347 | 936 | 0.37 |
| Others | 3908 | 14422 | 0.27 |
SFARI Genes have consistently a lower percentage of DE genes than the Genes with Neuronal annotations and a very similar level to the rest of the genes
The SFARI Genes have less DE genes than when using the original SFARI genes (makes sense since we are losing all genes with SFARI scores 5 and 6, which were the ones with the largest lFC)
lfc_list = seq(1, 1.2, 0.01)
all_counts = data.frame('group'='All', 'n'=as.character(nrow(genes_info)))
Others_counts = data.frame('group'='Others', n=as.character(sum(genes_info$Group == 'Others')))
Neuronal_counts = data.frame('group'='Neuronal', n=as.character(sum(genes_info$Neuronal)))
lfc_counts_all = genes_info %>% filter(Group == 'SFARI') %>% tally %>%
mutate('group'='SFARI', 'n'=as.character(n)) %>%
dplyr::select(group, n) %>%
bind_rows(Neuronal_counts, Others_counts, all_counts) %>%
mutate('lfc'=-1) %>% dplyr::select(lfc, group, n)
for(lfc in lfc_list){
# Recalculate genes_info with the new threshold (p-values change)
DE_genes = results(dds, lfcThreshold=log2(lfc), altHypothesis='greaterAbs') %>% data.frame %>%
mutate('ID' = rownames(.)) %>%
left_join(genes_info %>% dplyr::select(ID, Neuronal, gene.score, Group), by = 'ID') %>%
filter(padj<0.05 & abs(log2FoldChange)>log2(lfc))
# Calculate counts by groups
all_counts = data.frame('group'='All', 'n'=as.character(nrow(DE_genes)))
Others_counts = data.frame('group'='Others', n=as.character(sum(DE_genes$Group == 'Others')))
Neuronal_counts = data.frame('group'='Neuronal', n=as.character(sum(DE_genes$Neuronal)))
lfc_counts = DE_genes %>% filter(Group == 'SFARI') %>% tally %>%
mutate('group'='SFARI', 'n'=as.character(n)) %>%
bind_rows(Neuronal_counts, Others_counts, all_counts) %>%
mutate('lfc'=lfc) %>% dplyr::select(lfc, group, n)
# Update lfc_counts_all
lfc_counts_all = lfc_counts_all %>% bind_rows(lfc_counts)
}
# Add missing entries with 0s
lfc_counts_all = expand.grid('group'=unique(lfc_counts_all$group), 'lfc'=unique(lfc_counts_all$lfc)) %>%
left_join(lfc_counts_all, by=c('group','lfc')) %>% replace(is.na(.), 0)
# Calculate percentage of each group remaining
tot_counts = genes_info %>% filter(Group == 'SFARI') %>% tally() %>%
mutate('group'='SFARI', 'tot'=n) %>% dplyr::select(group, tot) %>%
bind_rows(data.frame('group'='Neuronal', 'tot'=sum(genes_info$Neuronal)),
data.frame('group' = 'Others', 'tot' = sum(genes_info$Group == 'Others')),
data.frame('group'='All', 'tot'=nrow(genes_info)))
lfc_counts_all = lfc_counts_all %>% filter(lfc!=-1) %>% #, group!='Others') %>%
left_join(tot_counts, by='group') %>% mutate('perc'=round(100*as.numeric(n)/tot,2))
# Plot change of number of genes
ggplotly(lfc_counts_all %>% filter(group != 'All') %>%
mutate(group = factor(group, levels = c('SFARI', 'Neuronal', 'Others'))) %>%
ggplot(aes(lfc, perc, color=group)) + geom_point(aes(id=n)) + geom_line() +
scale_color_manual(values=c('#00A4F7', SFARI_colour_hue(r=c(8,7)))) +
ylab('% of Differentially Expressed Genes') + xlab('Fold Change') +
ggtitle('Effect of filtering thresholds in SFARI Genes') + theme_minimal())
The higher the SFARI score, the higher the mean expression of the gene: This pattern is quite strong and it doesn’t have any biological interpretation, so it’s probably bias in the SFARI score assignment
The higher the SFARI score, the lower the standard deviation: This pattern is not as strong, but it is weird because the data was originally heteroscedastic with a positive relation between mean and variance, so the fact that the relation now seems to have reversed could mean that the vst normalisation ended up affecting the highly expressed genes more than it should have when trying to correct their higher variance
plot_data = data.frame('ID'=rownames(datExpr), 'MeanExpr'=rowMeans(datExpr), 'SDExpr'=apply(datExpr,1,sd)) %>%
left_join(genes_info, by='ID')
p1 = ggplotly(plot_data %>% ggplot(aes(gene.score, MeanExpr, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) + theme_minimal() +
theme(legend.position='none'))
p2 = ggplotly(plot_data %>% ggplot(aes(gene.score, SDExpr, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) + theme_minimal() +
ggtitle('Mean Expression (left) and SD (right) by SFARI score') +
theme(legend.position='none'))
plotly::subplot(p1, p2, nrows=1)
comparisons = list(c('1','Neuronal'), c('2','Neuronal'), c('3','Neuronal'), c('1','Others'), c('2','Others'),
c('3','Others'), c('1','2'), c('2','3'), c('Neuronal', 'Others'))
p1 = plot_data %>% ggplot(aes(gene.score, MeanExpr, color = gene.score)) +
stat_summary(fun.data = mean_sd, geom = 'errorbar', width = 0.5, color = '#999999') +
stat_summary(fun = mean, geom = 'point', size = 3) +
stat_compare_means(comparisons = comparisons, label = 'p.signif', method = 't.test',
method.args = list(var.equal = FALSE),
label.y = c(13, 12.5, 12, 15, 14.5, 14, rep(16.5, 3)), tip.length = 0.02) +
scale_colour_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) + xlab('') + ylab('Mean Expression') +
ggtitle('Mean Expression') + theme_minimal() + theme(legend.position = 'none')
p2 = plot_data %>% ggplot(aes(gene.score, SDExpr, color = gene.score)) +
stat_summary(fun.data = mean_sd, geom = 'errorbar', width = 0.5, color = '#999999') +
stat_summary(fun = mean, geom = 'point', size = 3) +
stat_compare_means(comparisons = comparisons, label = 'p.signif', method = 't.test',
method.args = list(var.equal = FALSE),
label.y = c(0.35, 0.325, 0.3, 0.425, 0.4, 0.375, rep(0.46, 3)), tip.length = 0.01) +
scale_colour_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) + xlab('') + ylab('SD of Expression') +
ggtitle('Standard Deviation') + theme_minimal() + theme(legend.position = 'none')
grid.arrange(p1, p2, nrow = 1)
Just to corroborate that the relation between sd and SFARI score used to be in the opposite direction before the normalisation: The higher the SFARI score the higher the mean expression and the higher the standard deviation
*There are a lot of outliers, but the plot is interactive so you can zoom in
# Save preprocessed results
datExpr_prep = datExpr
datMeta_prep = datMeta
genes_info_prep = genes_info
load('./../Data/filtered_raw_data.RData')
plot_data = data.frame('ID'=rownames(datExpr), 'MeanExpr'=rowMeans(datExpr), 'SDExpr'=apply(datExpr,1,sd)) %>%
left_join(genes_info, by='ID')
p1 = ggplotly(plot_data %>% ggplot(aes(gene.score, MeanExpr, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) + theme_minimal() +
theme(legend.position='none'))
p2 = ggplotly(plot_data %>% ggplot(aes(gene.score, SDExpr, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) + theme_minimal() +
ggtitle('Mean Expression (left) and SD (right) by SFARI score') +
theme(legend.position='none'))
plotly::subplot(p1, p2, nrows=1)
#Return to normalised version of the data
datExpr = datExpr_prep
datMeta = datMeta_prep
genes_info = genes_info_prep
rm(plot_data, p1, p2, datExpr_prep, datMeta_prep, genes_info_prep)
The proportion of over- and under-expressed genes in each SFARI Gene score is not very different to the proportion in the genes iwth Neuronal annotations nor in the rest of the genes (good, something less to worry about)
aux = genes_info %>% dplyr::select(ID, log2FoldChange, gene.score) %>%
left_join(data.frame('ID' = rownames(datExpr), 'meanExpr' = rowMeans(datExpr)), by = 'ID') %>%
dplyr::mutate(direction = ifelse(log2FoldChange>0, 'over-expressed', 'under-expressed'))
plot_data = aux %>% group_by(gene.score, direction) %>% tally(name = 'p') %>%
left_join(aux %>% group_by(gene.score) %>% tally, by = 'gene.score') %>% mutate(p = p/n, y=1)
plot_data %>% ggplot(aes(gene.score, p, fill=direction)) + geom_bar(stat='identity') +
geom_hline(yintercept = mean(plot_data$p[plot_data$direction=='under-expressed']),
linetype = 'dashed', color = 'white') +
ylab('Percentage') + xlab('SFARI Gene Scores') +
ggtitle('Direction of Fold-Change in genes by SFARI Score') + theme_minimal()
rm(aux)
There seems to be a negative relation between SFARI score and log fold change when it would be expected to be either positively correlated or independent from each other (this last one because there are other factors that determine if a gene is releated to Autism apart from differences in gene expression)
Wikipedia mentions the likely explanation for this: “A disadvantage and serious risk of using fold change in this setting is that it is biased and may misclassify differentially expressed genes with large differences (B − A) but small ratios (B/A), leading to poor identification of changes at high expression levels”.
Based on this, since we saw there is a strong relation between SFARI score and mean expression, the bias in log fold change affects mainly genes with high SFARI scores, which would be the ones we are most interested in.
On top of this, I believe this effect is made more extreme by the pattern found in the previous plots, since the higher expressed genes were the most affected by the normalisation transformation, ending up with a smaller variance than the rest of the data, which is related to smaller ratios. (This is a constant problem independently of the normalisation function used).
ggplotly(genes_info %>% ggplot(aes(x=gene.score, y=abs(log2FoldChange), fill=gene.score)) +
geom_boxplot() + scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) + xlab('SFARI Gene scores') +
ylab('log Fold Change Magnitude') + theme_minimal() + theme(legend.position='none'))
truncated_sd = function(x) data.frame('y' = mean(x), 'ymin' = max(mean(x)-sd(x),0), 'ymax' = mean(x)+sd(x))
genes_info %>% ggplot(aes(gene.score, abs(log2FoldChange), color = gene.score)) +
stat_summary(fun.data = function(x) return(truncated_sd(x)), geom = 'errorbar', width = 0.5,
color = '#999999') +
stat_summary(fun = mean, geom = 'point', size = 3) +
stat_compare_means(comparisons = comparisons, label = 'p.signif', method = 't.test',
method.args = list(var.equal = FALSE),
label.y = c(.5, .45, .4, .7, .65, .6, rep(0.8, 3)), tip.length = 0.01) +
scale_colour_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) + xlab('') + ylab('LFC Magnitude') +
theme_minimal() + theme(legend.position = 'none')
rm(truncated_sd)
We know that in general there is a negative relation between mean expression and lFC in genes, and we also know that there is a strong relation between SFARI Gene Scores and the mean level of expression of the genes.
This could explain the behaviour we found above, but we want to see if, once you control for the level of expression, the SFARI genes continue to have this relation to LFC or if it dissapears. (Being optimistic, perhaps the SFARI genes actually have higher LFC than genes with similar levels of expression, but we can’t see this in the original plot because of the relation between level of expression and LFC)
plot_data = genes_info %>% dplyr::select(ID, log2FoldChange, gene.score, significant) %>%
left_join(data.frame('ID' = rownames(datExpr), 'meanExpr' = rowMeans(datExpr)), by = 'ID') %>%
mutate(alpha = ifelse(gene.score == 'Others' , 0.1, ifelse(gene.score == 'Neuronal', 0.3, 0.7)))
p1 = plot_data %>% ggplot(aes(gene.score, meanExpr, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) + theme_minimal() +
ylab('Mean Level of Expression') + xlab('SFARI Gene Score') + theme(legend.position='none')
p2 = plot_data %>% ggplot(aes(meanExpr, abs(log2FoldChange), color = gene.score)) +
geom_point(alpha=plot_data$alpha) + geom_smooth(method='lm', color='#999999') +
ylab('LogFoldChange Magnitude') + xlab('Mean Expression') +
scale_color_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) +
theme_minimal() + theme(legend.position = 'none')
p2 = ggExtra::ggMarginal(p2, type='density',groupColour = TRUE, size=10)
grid.arrange(p2, p1, ncol=2, widths = c(0.6, 0.4))
rm(p1,p2)
plot_data = data.frame('meanExpr' = rowMeans(datExpr), 'LFC_magnitude' = abs(genes_info$log2FoldChange),
'gene.score' = genes_info$gene.score, 'p' = NA) %>% arrange(meanExpr)
w = 1000
for(i in 1:(nrow(plot_data)-w)){
plot_data$p[i+floor(w/2)] = mean(plot_data$LFC_magnitude[i:(i+w)])
}
aux_data = plot_data %>% filter(!gene.score %in% c('Neuronal','Others')) %>% group_by(gene.score) %>%
dplyr::summarise(mean_by_score = mean(meanExpr)) %>% ungroup %>%
mutate('color' = SFARI_colour_hue(r=1:6)[1:3])
ggplotly(plot_data %>% filter(!is.na(p)) %>% ggplot(aes(meanExpr, p)) + geom_line() +
xlab('Mean Level of Expression') + ylab('Sliding Average of LFC Magnitude') +
geom_vline(data = aux_data, aes(xintercept = mean_by_score), color = aux_data$color) +
ggtitle('Sliding Average of LFC Magnitude by Mean Level of Expression') + theme_minimal())
rm(aux_data)
We want to know what happens to the originally negative relation we found between SFARI Gene scores and lFC magnitude when we control for level of expression.
To do this, I’m going to compare each SFARI Gene with its closest non-SFARI neighbours following these steps:
Select one SFARI gene
Select its neighbours: 100 non-SFARI genes with the most similar mean level of Expression
Standardise the lFC magnitude of each of the neighbours and of the SFARI gene (using the mean and sd of the lFC magnitude of only these 101 genes)
Repeat this for each of the SFARI Genes, saving the standardised lFC magnitudes of all the SFARI genes and all the neighbours
Compare the distribution of this value between these two groups (SFARI and their neighbours)
This plot shows the general idea of steps 1, 2, and 3, selecting a random SFARI gene:
The plot on the left shows the original mean expression and lFC magnitude of the SFARI Gene and its 100 closest neighbours
The plot on the right shows the standardised lFC mangitude of the genes, and the vertical lines represent the value that is going to be recorded for each of this genes to be compared afterwards
n = 100
plot_data = genes_info %>% dplyr::select(ID, log2FoldChange, gene.score) %>%
left_join(data.frame('ID' = rownames(datExpr), 'meanExpr' = rowMeans(datExpr)), by = 'ID')
SFARI_gene = plot_data %>% filter(gene.score %in% c('1','2','3','4','5','6')) %>% sample_n(1) %>%
mutate(d=0, alpha = 1)
nn = plot_data %>% filter(gene.score %in% c('Neuronal','Others')) %>%
mutate(d = abs(meanExpr-SFARI_gene$meanExpr), alpha=0.5) %>% top_n(n=-n, wt = d)
plot_data = rbind(SFARI_gene, nn) %>%
mutate(std_magnitude = (abs(log2FoldChange) - mean(abs(log2FoldChange)))/sd(abs(log2FoldChange)))
p1 = plot_data %>% ggplot(aes(meanExpr, abs(log2FoldChange), color = gene.score)) +
geom_point(alpha = plot_data$alpha) + xlab('Mean Expression') + ylab('Log2 Fold Change Magnitude') +
scale_color_manual(values=SFARI_colour_hue(r=c(as.numeric(SFARI_gene$gene.score),8,7))) +
theme_minimal() + theme(legend.position='none')
p2 = plot_data %>% ggplot(aes(meanExpr, std_magnitude, color = gene.score)) +
geom_point(alpha = plot_data$alpha) +
geom_hline(aes(yintercept = mean(std_magnitude)), linetype = 'dashed', color = '#999999') +
scale_color_manual(values=SFARI_colour_hue(r=c(as.numeric(SFARI_gene$gene.score),8,7))) +
geom_segment(aes(x=SFARI_gene$meanExpr, y=mean(std_magnitude), xend = SFARI_gene$meanExpr,
yend = std_magnitude[1]), alpha = 0.5,
color = SFARI_colour_hue(r=1:8)[as.numeric(SFARI_gene$gene.score)]) +
xlab('Mean Expression') + ylab('Standardised LFC Magnitude') +
theme_minimal() + theme(legend.position='none')
for(i in 1:15){
random_sample = plot_data %>% filter(gene.score != SFARI_gene$gene.score) %>% sample_n(1)
p2 = p2 + geom_segment(x=random_sample$meanExpr, xend = random_sample$meanExpr, y=mean(plot_data$std_magnitude),
yend = random_sample$std_magnitude, alpha = 0.5, color = 'gray')
}
grid.arrange(p1, p2, ncol=2, top='Comparing SFARI Genes with their n closest neighbours by Mean Expression')
cat(paste0('SFARI gene\'s standardised distance to its neigbours\'s LFC magnitude: ',
round(plot_data$std_magnitude[1],4)))
## SFARI gene's standardised distance to its neigbours's LFC magnitude: 0.2335
rm(p1, p2, SFARI_gene, nn, random_sample, i)
As steps 4, and 5, say, we repeat this for all of the SFARI Genes, recording their standardised mangnitude as well as the ones from their neighbours so we can study them all together
Even when controlling for the relation between Mean Expression and LFC by comparing each SFARI Gene only with neighbouring genes, we see the same results as before!
Neuronal genes have higher magnitudes of lFC than non-SFARI, non-neuronal genes consistently (makes sense)
The higher the SFARI Gene score, the lower the LFC (even with genes with the same level of expression!)
get_std_lfc_magnitudes = function(data_info, SFARI_score, n){
SFARI_genes = data_info %>% filter(gene.score == as.character(SFARI_score))
std_magnitudes = data.frame(gene.score = as.character(), std_magnitude = as.numeric)
for(i in 1:nrow(SFARI_genes)){
SFARI_gene = SFARI_genes[i,]
nn = data_info %>% filter(gene.score %in% c('Neuronal','Others')) %>%
mutate(d = abs(meanExpr-SFARI_gene$meanExpr)) %>% top_n(n=-n, wt = d) %>% dplyr::select(-d)
iter_data = rbind(SFARI_gene, nn) %>%
mutate(std_magnitude = (abs(log2FoldChange) - mean(abs(log2FoldChange)))/sd(abs(log2FoldChange))) %>%
dplyr::select(gene.score, std_magnitude)
std_magnitudes = rbind(std_magnitudes, iter_data)
}
return(std_magnitudes)
}
data_info = genes_info %>% dplyr::select(ID, log2FoldChange, gene.score) %>%
left_join(data.frame('ID' = rownames(datExpr), 'meanExpr' = rowMeans(datExpr)), by = 'ID')
std_magnitudes_score_1 = get_std_lfc_magnitudes(data_info, 1, n)
std_magnitudes_score_2 = get_std_lfc_magnitudes(data_info, 2, n)
std_magnitudes_score_3 = get_std_lfc_magnitudes(data_info, 3, n)
p1 = std_magnitudes_score_1 %>% ggplot(aes(gene.score, std_magnitude, fill = gene.score)) + geom_boxplot() +
xlab('') + ylab('Standardised LFC Magnitude') + scale_fill_manual(values=SFARI_colour_hue(r=c(1,8,7))) +
theme_minimal() + theme(legend.position = 'none')
p2 = std_magnitudes_score_2 %>% ggplot(aes(gene.score, std_magnitude, fill = gene.score)) + geom_boxplot() +
xlab('') + ylab('') + scale_fill_manual(values=SFARI_colour_hue(r=c(2,8,7))) + theme_minimal() +
theme(legend.position = 'none')
p3 = std_magnitudes_score_3 %>% ggplot(aes(gene.score, std_magnitude, fill = gene.score)) + geom_boxplot() +
xlab('') + ylab('') + scale_fill_manual(values=SFARI_colour_hue(r=c(3,8,7))) + theme_minimal() +
theme(legend.position = 'none')
grid.arrange(p1,p2,p3,nrow=1,
top = 'Comparison of LFC Magnitude of SFARI gens and their closest neighbours by Mean Expression')
rm(p1,p2,p3)
p1 = std_magnitudes_score_1 %>% ggplot(aes(gene.score, std_magnitude, color = gene.score)) +
stat_summary(fun.data = mean_sd, geom = 'errorbar', width = 0.5, color = '#999999') +
stat_summary(fun = mean, geom = 'point', size = 3) +
stat_compare_means(comparisons = list(c('1','Neuronal'), c('1','Others')), label = 'p.signif',
method = 't.test', method.args = list(var.equal = FALSE), label.y = c(1.7, 2),
tip.length = 0.01) +
scale_colour_manual(values=SFARI_colour_hue(r=c(1,8,7))) + xlab('') + ylab('Standardised LFC Magnitude') +
theme_minimal() + theme(legend.position = 'none')
p2 = std_magnitudes_score_2 %>% ggplot(aes(gene.score, std_magnitude, color = gene.score)) +
stat_summary(fun.data = mean_sd, geom = 'errorbar', width = 0.5, color = '#999999') +
stat_summary(fun = mean, geom = 'point', size = 3) +
stat_compare_means(comparisons = list(c('2','Neuronal'), c('2','Others')), label = 'p.signif',
method = 't.test', method.args = list(var.equal = FALSE), label.y = c(1.7, 2),
tip.length = 0.01) +
scale_colour_manual(values=SFARI_colour_hue(r=c(2,8,7))) + xlab('') + ylab('Standardised LFC Magnitude') +
theme_minimal() + theme(legend.position = 'none')
p3 = std_magnitudes_score_3 %>% ggplot(aes(gene.score, std_magnitude, color = gene.score)) +
stat_summary(fun.data = mean_sd, geom = 'errorbar', width = 0.5, color = '#999999') +
stat_summary(fun = mean, geom = 'point', size = 3) +
stat_compare_means(comparisons = list(c('3','Neuronal'), c('3','Others')), label = 'p.signif',
method = 't.test', method.args = list(var.equal = FALSE), label.y = c(1.7, 2),
tip.length = 0.01) +
scale_colour_manual(values=SFARI_colour_hue(r=c(3,8,7))) + xlab('') + ylab('Standardised LFC Magnitude') +
theme_minimal() + theme(legend.position = 'none')
grid.arrange(p1, p2, p3, nrow = 1,
top = 'Comparison of LFC Magnitude of SFARI genes and their closest neighbours by Mean Expression')
rm(p1, p2, p3)
Hypothesis test to see if SFARI Genes by score and their neighbours have the same mean:
Null hypothesis: Differences in means is equal to zero
get_t_test_vals = function(std_magnitudes_score, score){
t_test_Neuronal = t.test(std_magnitude ~ gene.score,
data = std_magnitudes_score[std_magnitudes_score$gene.score != 'Others',],
var.equal=FALSE)
t_test_others = t.test(std_magnitude ~ gene.score,
data = std_magnitudes_score[std_magnitudes_score$gene.score != 'Neuronal',],
var.equal=FALSE)
neuronal = c(score, 'Neuronal', t_test_Neuronal$estimate[1][[1]],
t_test_Neuronal$estimate[2][[1]], t_test_Neuronal$p.value)
others = c(score, 'Others', t_test_others$estimate[1][[1]], t_test_others$estimate[2][[1]],
t_test_others$p.value)
return(rbind(neuronal, others))
}
results_1 = get_t_test_vals(std_magnitudes_score_1, 1)
results_2 = get_t_test_vals(std_magnitudes_score_2, 2)
results_3 = get_t_test_vals(std_magnitudes_score_3, 3)
t_test_df = rbind(results_1, results_2, results_3) %>% data.frame %>%
dplyr::rename('SFARI_score' = X1, 'Comparison' = X2, 'mean_group_1' = X3,
'mean_group_2' = X4, 'p_val' = X5) %>%
mutate(BH_p_val = p.adjust(as.numeric(as.character(p_val)), method = 'BH')) %>%
mutate(same_mean = ifelse(BH_p_val<0.05, 'No', 'Possible'))
t_test_df %>% kable
| SFARI_score | Comparison | mean_group_1 | mean_group_2 | p_val | BH_p_val | same_mean |
|---|---|---|---|---|---|---|
| 1 | Neuronal | -0.18914393994337 | 0.217825125962056 | 9.9930226669465e-07 | 0.0000060 | No |
| 1 | Others | -0.18914393994337 | -0.0253092149951543 | 0.0317784642862954 | 0.0476677 | No |
| 2 | Neuronal | -0.0544738583068819 | 0.235983847086988 | 3.35199386187546e-05 | 0.0001006 | No |
| 2 | Others | -0.0544738583068819 | -0.0229625408270816 | 0.621882657183584 | 0.6218827 | Possible |
| 3 | Neuronal | 0.0441277680655755 | 0.193780986664516 | 0.00536156586786669 | 0.0107231 | No |
| 3 | Others | 0.0441277680655755 | -0.0172259178092508 | 0.222231854169067 | 0.2666782 | Possible |
rm(std_magnitudes_score_1, std_magnitudes_score_2, std_magnitudes_score_3)
The proportion of DE genes for each SFARI Genes is quite similar
plot_info = genes_info %>% group_by(gene.score, significant) %>% tally(name = 'DE') %>% ungroup %>% ungroup %>%
left_join(genes_info %>% group_by(gene.score) %>% tally(name = 'total') %>%
ungroup, by = 'gene.score') %>% filter(significant) %>%
mutate('perc' = 100*DE/total)
ggplotly(plot_info %>% ggplot(aes(gene.score, perc, fill = gene.score)) + geom_bar(stat='identity') +
xlab('SFARI Gene Score') + ylab('% of DE genes') + theme_minimal() + theme(legend.position = 'none') +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,8,7))))
But in our dataset, the higher the level of expression of a gene, the more likely the gene is to be DE (this can be seen by ordering the genes by level of expression and calculating the proportion of DE Genes using a sliding window). Based one this, the SFARI Score 1 should have the highest proportion of DE Genes, followed by score 2 and score 3 in the end
plot_data = data.frame('meanExpr' = rowMeans(datExpr), 'DE' = genes_info$significant,
'gene.score' = genes_info$gene.score, 'p' = NA) %>% arrange(meanExpr)
w = 1000
for(i in 1:(nrow(plot_data)-w)){
plot_data$p[i+floor(w/2)] = mean(plot_data$DE[i:(i+w)])*100
}
aux_data = plot_data %>% filter(!gene.score %in% c('Neuronal','Others')) %>% group_by(gene.score) %>%
dplyr::summarise(mean_by_score = mean(meanExpr)) %>% ungroup %>%
mutate('color' = SFARI_colour_hue(r=1:6)[1:3])
ggplotly(plot_data %>% filter(!is.na(p)) %>% ggplot(aes(meanExpr, p)) + geom_line() +
xlab('Mean Level of Expression') + ylab('Sliding Percentage of DE Genes') +
geom_vline(data = aux_data, aes(xintercept = mean_by_score), color = aux_data$color) +
ggtitle('Sliding Percentage of DE Genes by Mean Level of Expression') + theme_minimal())
# aux_data = plot_data %>% group_by(gene.score) %>%
# dplyr::summarise('mean_ME' = mean(meanExpr), 'sd_ME' = sd(meanExpr)) %>% ungroup %>%
# mutate(xmin = mean_ME-sd_ME, xmax = mean_ME+sd_ME,
# ymin = min(plot_data$p, na.rm = TRUE)*0.8, ymax = max(plot_data$p, na.rm = TRUE)*1.2) %>%
# filter(gene.score %in% c('1','2','3'))
# plot_data %>% filter(!is.na(p)) %>% ggplot(aes(meanExpr, p)) + geom_line() +
# xlab('Mean Level of Expression') + ylab('% of DE Genes in Sliding Window') +
# geom_vline(xintercept = aux_data$mean_ME[aux_data$gene.score == '1'],
# color = SFARI_colour_hue(r=1:6)[1]) +
# geom_vline(xintercept = mean(plot_data$meanExpr[plot_data$gene.score=='2']),
# color = SFARI_colour_hue(r=1:6)[2]) +
# geom_vline(xintercept = mean(plot_data$meanExpr[plot_data$gene.score=='3']),
# color = SFARI_colour_hue(r=1:6)[3]) + theme_minimal() +
# annotate('rect', xmin = aux_data$xmin, xmax = aux_data$xmax, ymin = aux_data$ymin, ymax = aux_data$ymax,
# fill = SFARI_colour_hue(r=1:6)[1:3], color = 'transparent', alpha = 0.2),
# theme_minimal())
rm(aux_data)
We want to see how the different scores in the SFARI Genes compare to other groups of genes with similar levels of expression when studying the proportion of DE genes
To do this, I’m going to compare each SFARI Gene with its closest non-SFARI neighbours following these steps:
Select one SFARI gene
Select its neighbours: 100 non-SFARI genes with the most similar mean level of Expression
Calculate the % of these neighbours that are DE and store this value
Repeat this for all of the genes in a specific SFARI score: We have a distribution of % DE neighbours and a single value indicating the percentage of DE genes in that SFARI score
4.1 Measure how annomalous the value for the SFARI scores is by calculating its distance to the distribution (in standard devitions)
Notes:
The higher the SFARI Score, the farther away its percentage of DE genes is from the distribution of its neighbours’ DE Genes, both for Neuronal and Others groups
get_neighbours_DE = function(data_info, SFARI_score, n){
SFARI_genes = data_info %>% filter(gene.score == as.character(SFARI_score))
perc_DE = data.frame(gene.score = as.character(), p_DE = as.numeric)
for(i in 1:nrow(SFARI_genes)){
SFARI_gene = SFARI_genes[i,]
nn = data_info %>% filter(gene.score %in% c('Neuronal','Others')) %>%
mutate(d = abs(meanExpr-SFARI_gene$meanExpr)) %>% top_n(n=-n, wt = d) %>%
group_by(gene.score) %>% summarise(perc_DE = 100*mean(significant)) %>% ungroup
perc_DE = rbind(perc_DE, nn)
}
colnames(perc_DE) = c('gene.score', 'perc_DE')
return(perc_DE)
}
calc_dist = function(SFARI_score, df, group){
SFARI_p = 100*mean(genes_info$significant[genes_info$gene.score==SFARI_score])
mean_nn = df$perc_DE[df$gene.score == group] %>% mean
sd_nn = df$perc_DE[df$gene.score == group] %>% sd
dist = round(abs(SFARI_p-mean_nn)/sd_nn, 2)
return(dist)
}
data_info = genes_info %>% dplyr::select(ID, significant, gene.score) %>%
left_join(data.frame('ID' = rownames(datExpr), 'meanExpr' = rowMeans(datExpr)), by = 'ID')
n = 100
perc_DE_nn_score_1 = get_neighbours_DE(data_info, 1, n)
perc_DE_nn_score_2 = get_neighbours_DE(data_info, 2, n)
perc_DE_nn_score_3 = get_neighbours_DE(data_info, 3, n)
p1 = perc_DE_nn_score_1 %>% ggplot(aes(gene.score, perc_DE, fill = gene.score)) + geom_boxplot() +
xlab('') + ylab('% of DE Genes') +
geom_hline(yintercept = 100*mean(genes_info$significant[genes_info$gene.score=='1']),
color = SFARI_colour_hue(r=1:6)[1]) +
ggtitle(paste0('Neighbours of SFARI Score 1 Genes',
'\n\n Dist to Neuronal: ', calc_dist('1', perc_DE_nn_score_1, 'Neuronal'), ' SD',
'\n Dist to Others: ', calc_dist('1', perc_DE_nn_score_1, 'Others'), ' SD')) +
scale_fill_manual(values=SFARI_colour_hue(r=c(8,7))) + theme_minimal() + theme(legend.position = 'none')
p2 = perc_DE_nn_score_2 %>% ggplot(aes(gene.score, perc_DE, fill = gene.score)) + geom_boxplot() +
xlab('') + ylab('% of DE Genes') +
geom_hline(yintercept = 100*mean(genes_info$significant[genes_info$gene.score=='2']),
color = SFARI_colour_hue(r=1:6)[2]) +
ggtitle(paste0('Neighbours of SFARI Score 2 Genes',
'\n\n Dist to Neuronal: ', calc_dist('2', perc_DE_nn_score_2, 'Neuronal'), ' SD',
'\n Dist to Others: ', calc_dist('2', perc_DE_nn_score_2, 'Others'), ' SD')) +
scale_fill_manual(values=SFARI_colour_hue(r=c(8,7))) + theme_minimal() + theme(legend.position = 'none')
p3 = perc_DE_nn_score_3 %>% ggplot(aes(gene.score, perc_DE, fill = gene.score)) + geom_boxplot() +
xlab('') + ylab('% of DE Genes') +
geom_hline(yintercept = 100*mean(genes_info$significant[genes_info$gene.score=='3']),
color = SFARI_colour_hue(r=1:6)[3]) +
ggtitle(paste0('Neighbours of SFARI Score 3 Genes',
'\n\n Dist to Neuronal: ', calc_dist('3', perc_DE_nn_score_3, 'Neuronal'), ' SD',
'\n Dist to Others: ', calc_dist('3', perc_DE_nn_score_3, 'Others'), ' SD')) +
scale_fill_manual(values=SFARI_colour_hue(r=c(8,7))) + theme_minimal() + theme(legend.position = 'none')
grid.arrange(p1,p2,p3, nrow = 1)
ymax = max(mean(perc_DE_nn_score_1$perc_DE[perc_DE_nn_score_1$gene.score=='Neuronal'])+
sd(perc_DE_nn_score_1$perc_DE[perc_DE_nn_score_1$gene.score=='Neuronal']),
mean(perc_DE_nn_score_2$perc_DE[perc_DE_nn_score_2$gene.score=='Neuronal'])+
sd(perc_DE_nn_score_2$perc_DE[perc_DE_nn_score_2$gene.score=='Neuronal']),
mean(perc_DE_nn_score_3$perc_DE[perc_DE_nn_score_3$gene.score=='Neuronal'])+
sd(perc_DE_nn_score_3$perc_DE[perc_DE_nn_score_3$gene.score=='Neuronal']))
ymin = min(mean(perc_DE_nn_score_1$perc_DE[perc_DE_nn_score_1$gene.score=='Neuronal'])-
sd(perc_DE_nn_score_1$perc_DE[perc_DE_nn_score_1$gene.score=='Neuronal']),
mean(perc_DE_nn_score_2$perc_DE[perc_DE_nn_score_2$gene.score=='Neuronal'])-
sd(perc_DE_nn_score_2$perc_DE[perc_DE_nn_score_2$gene.score=='Neuronal']),
mean(perc_DE_nn_score_3$perc_DE[perc_DE_nn_score_3$gene.score=='Neuronal'])-
sd(perc_DE_nn_score_3$perc_DE[perc_DE_nn_score_3$gene.score=='Neuronal']))
p1 = perc_DE_nn_score_1 %>% ggplot(aes(gene.score, perc_DE, color = gene.score)) +
stat_summary(fun.data = mean_sd, geom = 'errorbar', width = 0.5, color = '#999999') +
stat_summary(fun = mean, geom = 'point', size = 3) +
geom_hline(yintercept = 100*mean(genes_info$significant[genes_info$gene.score=='1']),
color = SFARI_colour_hue(r=1:6)[1]) +
ggtitle(paste0('SFARI Score 1 Genes',
'\n\n Dist to Neuronal: ', calc_dist('1', perc_DE_nn_score_1, 'Neuronal'), ' SD',
'\n Dist to Others: ', calc_dist('1', perc_DE_nn_score_1, 'Others'), ' SD')) +
scale_colour_manual(values=SFARI_colour_hue(r=c(8,7))) +
xlab('Neighbours') + ylab('Percentage of DE Genes') +
coord_cartesian(ylim = c(ymin, ymax)) + theme_minimal() + theme(legend.position = 'none')
p2 = perc_DE_nn_score_2 %>% ggplot(aes(gene.score, perc_DE, color = gene.score)) +
stat_summary(fun.data = mean_sd, geom = 'errorbar', width = 0.5, color = '#999999') +
stat_summary(fun = mean, geom = 'point', size = 3) +
geom_hline(yintercept = 100*mean(genes_info$significant[genes_info$gene.score=='2']),
color = SFARI_colour_hue(r=1:6)[2]) +
ggtitle(paste0('SFARI Score 2 Genes',
'\n\n Dist to Neuronal: ', calc_dist('2', perc_DE_nn_score_2, 'Neuronal'), ' SD',
'\n Dist to Others: ', calc_dist('2', perc_DE_nn_score_2, 'Others'), ' SD')) +
scale_colour_manual(values=SFARI_colour_hue(r=c(8,7))) + xlab('Neighbours') + ylab('') +
coord_cartesian(ylim = c(ymin, ymax)) +
theme_minimal() + theme(legend.position = 'none')
p3 = perc_DE_nn_score_3 %>% ggplot(aes(gene.score, perc_DE, color = gene.score)) +
stat_summary(fun.data = mean_sd, geom = 'errorbar', width = 0.5, color = '#999999') +
stat_summary(fun = mean, geom = 'point', size = 3) +
geom_hline(yintercept = 100*mean(genes_info$significant[genes_info$gene.score=='3']),
color = SFARI_colour_hue(r=1:6)[3]) +
ggtitle(paste0('SFARI Score 3 Genes',
'\n\n Dist to Neuronal: ', calc_dist('3', perc_DE_nn_score_3, 'Neuronal'), ' SD',
'\n Dist to Others: ', calc_dist('3', perc_DE_nn_score_3, 'Others'), ' SD')) +
scale_colour_manual(values=SFARI_colour_hue(r=c(8,7))) + xlab('Neighbours') + ylab('') +
coord_cartesian(ylim = c(ymin, ymax)) + theme_minimal() + theme(legend.position = 'none')
grid.arrange(p1,p2,p3, nrow = 1)
rm(get_neighbours_DE, calc_dist, data_info, n, perc_DE_nn_score_1, perc_DE_nn_score_2, perc_DE_nn_score_3)
The % of DE Genes is very similar for all 3 scores and for genes with no annotation
All SFARI scores except are more affected by the filtering than the genes with Neuronal annotations
Using the null hypothesis \(H_0: lfc=0\), 244/789 SFARI genes are statistically significant (31%)
lfc_list = seq(1, 1.2, 0.01)
all_counts = data.frame('group'='All', 'n'=as.character(nrow(genes_info)))
Others_counts = data.frame('group'='Others', n=as.character(sum(genes_info$gene.score == 'Others')))
Neuronal_counts = data.frame('group'='Neuronal', n=as.character(sum(genes_info$Neuronal)))
lfc_counts_all = genes_info %>% group_by(`gene-score`) %>% filter(`gene-score` != 'Others') %>% tally %>%
mutate('group'=as.factor(`gene-score`), 'n'=as.character(n)) %>%
dplyr::select(group, n) %>%
bind_rows(Neuronal_counts, Others_counts, all_counts) %>%
mutate('lfc'=-1) %>% dplyr::select(lfc, group, n)
for(lfc in lfc_list){
# Recalculate genes_info with the new threshold (p-values change)
DE_genes = results(dds, lfcThreshold=log2(lfc), altHypothesis='greaterAbs') %>% data.frame
DE_genes = DE_genes %>% mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>%
mutate(`gene-score`=ifelse(is.na(`gene-score`), 'Others', `gene-score`)) %>%
distinct(ID, .keep_all = TRUE) %>% left_join(GO_neuronal, by='ID') %>%
mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
mutate(gene.score=ifelse(`gene-score`=='Others' & Neuronal==1, 'Neuronal', `gene-score`))
DE_genes = DE_genes %>% filter(padj<0.05 & abs(log2FoldChange)>log2(lfc))
# Calculate counts by groups
all_counts = data.frame('group'='All', 'n'=as.character(nrow(DE_genes)))
Others_counts = data.frame('group'='Others', n=as.character(sum(DE_genes$`gene-score` == 'Others')))
Neuronal_counts = data.frame('group'='Neuronal', n=as.character(sum(DE_genes$Neuronal)))
lfc_counts = DE_genes %>% group_by(`gene-score`) %>% tally %>%
mutate('group'=`gene-score`, 'n'=as.character(n)) %>%
bind_rows(Neuronal_counts, all_counts) %>%
mutate('lfc'=lfc) %>% dplyr::select(lfc, group, n)
# Update lfc_counts_all
lfc_counts_all = lfc_counts_all %>% bind_rows(lfc_counts)
}
# Add missing entries with 0s
lfc_counts_all = expand.grid('group'=unique(lfc_counts_all$group), 'lfc'=unique(lfc_counts_all$lfc)) %>%
left_join(lfc_counts_all, by=c('group','lfc')) %>% replace(is.na(.), 0)
# Calculate percentage of each group remaining
tot_counts = genes_info %>% group_by(`gene-score`) %>% tally() %>% filter(`gene-score`!='Others') %>%
mutate('group'=`gene-score`, 'tot'=n) %>% dplyr::select(group, tot) %>%
bind_rows(data.frame('group'='Neuronal', 'tot'=sum(genes_info$Neuronal)),
data.frame('group'='Others', 'tot'=sum(genes_info$gene.score=='Others'&!genes_info$Neuronal)),
data.frame('group'='All', 'tot'=nrow(genes_info)))
lfc_counts_all = lfc_counts_all %>% filter(lfc!=-1) %>% #, group!='Others') %>%
left_join(tot_counts, by='group') %>% mutate('perc'=round(100*as.numeric(n)/tot,2))
# Plot change of number of genes
ggplotly(lfc_counts_all %>% filter(group != 'All') %>% ggplot(aes(lfc, perc, color=group)) +
geom_point(aes(id=n)) + geom_line() + scale_color_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) +
ylab('% of Differentially Expressed Genes') + xlab('Fold Change') +
ggtitle('Effect of filtering thresholds by SFARI score') + theme_minimal())
rm(lfc_list, all_counts, Neuronal_counts, lfc_counts_all, lfc, lfc_counts, lfc_counts_all, tot_counts,
lfc_counts_all, Others_counts)
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] knitr_1.28 expss_0.10.2
## [3] DESeq2_1.24.0 SummarizedExperiment_1.14.1
## [5] DelayedArray_0.10.0 BiocParallel_1.18.1
## [7] matrixStats_0.56.0 Biobase_2.44.0
## [9] GenomicRanges_1.36.1 GenomeInfoDb_1.20.0
## [11] IRanges_2.18.3 S4Vectors_0.22.1
## [13] BiocGenerics_0.30.0 ClusterR_1.2.1
## [15] gtools_3.8.2 Rtsne_0.15
## [17] ggpubr_0.2.5 magrittr_1.5
## [19] GGally_1.5.0 gridExtra_2.3
## [21] viridis_0.5.1 viridisLite_0.3.0
## [23] RColorBrewer_1.1-2 plotlyutils_0.0.0.9000
## [25] plotly_4.9.2 glue_1.3.2
## [27] reshape2_1.4.3 forcats_0.5.0
## [29] stringr_1.4.0 dplyr_0.8.5
## [31] purrr_0.3.3 readr_1.3.1
## [33] tidyr_1.0.2 tibble_3.0.0
## [35] ggplot2_3.3.0 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 backports_1.1.5 Hmisc_4.4-0
## [4] plyr_1.8.6 lazyeval_0.2.2 splines_3.6.3
## [7] gmp_0.5-13.6 crosstalk_1.1.0.1 digest_0.6.25
## [10] htmltools_0.4.0 fansi_0.4.1 checkmate_2.0.0
## [13] memoise_1.1.0 cluster_2.1.0 annotate_1.62.0
## [16] modelr_0.1.6 jpeg_0.1-8.1 colorspace_1.4-1
## [19] blob_1.2.1 rvest_0.3.5 haven_2.2.0
## [22] xfun_0.12 crayon_1.3.4 RCurl_1.98-1.1
## [25] jsonlite_1.6.1 genefilter_1.66.0 survival_3.1-12
## [28] gtable_0.3.0 zlibbioc_1.30.0 XVector_0.24.0
## [31] scales_1.1.0 DBI_1.1.0 miniUI_0.1.1.1
## [34] Rcpp_1.0.4.6 xtable_1.8-4 htmlTable_1.13.3
## [37] foreign_0.8-76 bit_1.1-15.2 Formula_1.2-3
## [40] htmlwidgets_1.5.1 httr_1.4.1 acepack_1.4.1
## [43] ellipsis_0.3.0 pkgconfig_2.0.3 reshape_0.8.8
## [46] XML_3.99-0.3 farver_2.0.3 nnet_7.3-14
## [49] dbplyr_1.4.2 locfit_1.5-9.4 later_1.0.0
## [52] tidyselect_1.0.0 labeling_0.3 rlang_0.4.5
## [55] AnnotationDbi_1.46.1 munsell_0.5.0 cellranger_1.1.0
## [58] tools_3.6.3 cli_2.0.2 generics_0.0.2
## [61] RSQLite_2.2.0 broom_0.5.5 fastmap_1.0.1
## [64] evaluate_0.14 yaml_2.2.1 bit64_0.9-7
## [67] fs_1.4.0 nlme_3.1-147 mime_0.9
## [70] ggExtra_0.9 xml2_1.2.5 compiler_3.6.3
## [73] rstudioapi_0.11 png_0.1-7 ggsignif_0.6.0
## [76] reprex_0.3.0 geneplotter_1.62.0 stringi_1.4.6
## [79] highr_0.8 lattice_0.20-41 Matrix_1.2-18
## [82] vctrs_0.2.4 pillar_1.4.3 lifecycle_0.2.0
## [85] data.table_1.12.8 bitops_1.0-6 httpuv_1.5.2
## [88] R6_2.4.1 latticeExtra_0.6-29 promises_1.1.0
## [91] assertthat_0.2.1 withr_2.1.2 GenomeInfoDbData_1.2.1
## [94] mgcv_1.8-31 hms_0.5.3 grid_3.6.3
## [97] rpart_4.1-15 rmarkdown_2.1 shiny_1.4.0.2
## [100] lubridate_1.7.4 base64enc_0.1-3